# OTU table
otu_table = read_tsv("../data/otu-table.tsv", skip = 1)
otu_id = otu_table$`#OTU ID`
otu_table = data.frame(otu_table[, -1], check.names = FALSE, row.names = otu_id)
# Taxonomy table
tax = read_tsv("../data/taxonomy.tsv")
otu_id = tax$`Feature ID`
tax = data.frame(tax[, - c(1, 3)], row.names = otu_id)
tax = tax %>% separate(col = Taxon,
into = c("Kingdom", "Phylum", "Class", "Order",
"Family", "Genus", "Species"),
sep = ";")
for (i in 1:ncol(tax)) {
tax[, i] = sapply(tax[, i], function(x) str_split(x, "__")[[1]][2])
}
tax = as.matrix(tax)
tax[tax == ""] = NA
# Tree
tree = read_tree("../data/tree.nwk")
# Meta data
meta_data = read_tsv("../data/metadata.tsv")
meta_data = meta_data %>%
mutate_if(is.character, as.factor)
meta_data$sampleid = as.character(meta_data$sampleid)
meta_data$caregiver_stress_level = factor(meta_data$caregiver_stress_level,
levels = c("Low", "Medium", "High"))
meta_data$depression_level = factor(meta_data$depression_level,
levels = c("Low", "Medium", "High"))
meta_data$hostility_level = factor(meta_data$hostility_level,
levels = c("Low", "Medium", "High"))
meta_data$das_level = factor(meta_data$das_level,
levels = c("Low", "Medium", "High"))
meta_data$metabolic_syndrome_level = factor(meta_data$metabolic_syndrome_level,
levels = c("No or Mild",
"Moderate",
"Severe"))
OTU = otu_table(otu_table, taxa_are_rows = TRUE)
META = sample_data(meta_data)
sample_names(META) = meta_data$sampleid
TAX = tax_table(tax)
otu_data = phyloseq(OTU, TAX, META, tree)
pipeline = function(pseq, sample_id, adj_formula, group){
feature_table = abundances(pseq)
meta_data = meta(pseq)
p_adj_method = "BH"; zero_cut = 0.90; lib_cut = 1000
struc_zero = TRUE; neg_lb = FALSE; tol = 1e-5
max_iter = 100; conserve = FALSE; alpha = 0.05
per_num = 1000; global = FALSE; direct = TRUE
dunnett = FALSE; pattern = NULL
out = ANCOM_BC(feature_table, meta_data, sample_id, adj_formula, p_adj_method,
zero_cut, lib_cut, struc_zero, neg_lb, group, tol, max_iter,
conserve, alpha, per_num, global, direct, dunnett, pattern)
if(is.null(out$res_direct)) {
res = out$res
} else {
res = out$res_direct
}
res_beta = data.frame(res$beta * res$diff_abn, check.names = FALSE) %>%
rownames_to_column("taxon_id")
res_se = data.frame(res$se * res$diff_abn, check.names = FALSE) %>%
rownames_to_column("taxon_id")
colnames(res_se)[-1] = paste0(colnames(res_se)[-1], "SD")
res_model = res_beta %>%
left_join(res_se, by = "taxon_id") %>%
dplyr::select(-matches("Intercept"))
# Heatmap
df_fig = res_beta %>%
dplyr::select(-matches("Intercept"))
level_ch = levels(as.factor(meta_data[, group]))
col_label = outer(level_ch, level_ch, function(x, y) paste0(x, " - ", y))
col_label = col_label[lower.tri(col_label)]
colnames(df_fig) = c("taxon_id", col_label)
df_fig_long = df_fig %>%
gather(key = "key", value = "value", -taxon_id)
df_fig_long$taxon_id = factor(df_fig_long$taxon_id)
p_heat = df_fig_long %>%
ggplot(aes(x = key, y = taxon_id)) +
scale_x_discrete(expand = c(0, 0)) +
labs(x = NULL, y = NULL, fill = "Log fold change") +
geom_tile(aes(fill = value)) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
theme_bw() +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Relative abundance plot
df_fig_long = data.frame(t(abundances(pseq)), check.names = FALSE) %>%
rownames_to_column(sample_id) %>%
pivot_longer(-sample_id, names_to = "taxon", values_to = "value") %>%
left_join(meta_data %>%
dplyr::select(sample_id, group))
colnames(df_fig_long)[ncol(df_fig_long)] = "group"
df_fig_wide = df_fig_long %>%
filter(!is.na(group)) %>%
group_by(group, taxon) %>%
summarise(value = sum(value)) %>%
pivot_wider(id_cols = "taxon",
names_from = "group",
values_from = "value") %>%
mutate(total = rowSums(select_if(., is.numeric))) %>%
mutate_if(is.numeric, function(x) 100 * x/sum(x)) %>%
mutate(taxon = case_when(
taxon == "Unknown" ~ "Others",
total < 2 ~ "Others",
TRUE ~ taxon)) %>%
group_by(taxon) %>%
summarise_all(sum)
df_fig = df_fig_wide %>%
dplyr::select(-total) %>%
pivot_longer(-taxon, names_to = "group", values_to = "value")
df_fig$taxon = forcats::fct_relevel(df_fig$taxon, "Others", after = Inf)
p_rel = df_fig %>%
ggplot(aes(x = group, y = value, fill = taxon)) +
geom_col(position = position_stack(), color = "black") +
labs(x = NULL, y = "Relative abundance (%)") +
theme_bw() +
scale_fill_discrete(name = NULL) +
theme(legend.position = "right",
axis.ticks.x = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
strip.background = element_rect(fill="white"))
obj = list(out = res_model, p_heat = p_heat, p_rel = p_rel)
return(obj)
}
1. Analyses at family level
# Aggregate taxa
family_data = aggregate_taxa(otu_data, "Family")
pseq = family_data
sample_id = "sampleid"
covariates = c("alcohol", "caregiver_stress_level", "depression_level",
"hostility_level", "das_level", "metabolic_syndrome_level")
labels = c("Alcohol Use", "Caregiver Stress Level", "Depression Level",
"Hostility Level", "DAS Level", "Metabolic Syndrome Level")
for (i in seq_along(covariates)) {
cat("\n \n \n")
covariate = covariates[i]
label = labels[i]
adj_formula = covariate
group = covariate
obj = pipeline(pseq, sample_id, adj_formula, group)
file_path = paste0("../outputs/ancombc_family_", covariate, ".csv")
res = obj$out
res[, -1] = signif(res[, -1], 3)
write_csv(res, file_path)
heat_path1 = paste0("../images/abundance/family_heatmap_", covariate, ".jpeg")
heat_path2 = paste0("../images/abundance/family_heatmap_", covariate, ".pdf")
p1 = obj$p_heat +
labs(title = label) +
theme(plot.title = element_text(hjust = 0.5))
print(p1)
cat("\n \n \n")
ggsave(filename = heat_path1, plot = p1, height = 5,
width = 6.25, units = 'in', dpi = 300)
ggsave(filename = heat_path2, plot = p1, height = 5, width = 6.25)
rel_path1 = paste0("../images/abundance/family_relative_", covariate, ".jpeg")
rel_path2 = paste0("../images/abundance/family_relative_", covariate, ".pdf")
p2 = obj$p_rel +
labs(title = label) +
scale_fill_brewer(palette = "Paired") +
theme(plot.title = element_text(hjust = 0.5))
print(p2)
cat("\n \n \n")
ggsave(filename = rel_path1, plot = p2, height = 5,
width = 6.25, units = 'in', dpi = 300)
ggsave(filename = rel_path2, plot = p2, height = 5, width = 6.25)
}












2. Analyses at genus level
# Aggregate taxa
genus_data = aggregate_taxa(otu_data, "Genus")
genus_data2 = merge_taxa2(genus_data, pattern = "\\Clostridium", name = "Clostridium")
pseq = genus_data2
sample_id = "sampleid"
covariates = c("alcohol", "caregiver_stress_level", "depression_level",
"hostility_level", "das_level", "metabolic_syndrome_level")
labels = c("Alcohol Use", "Caregiver Stress Level", "Depression Level",
"Hostility Level", "DAS Level", "Metabolic Syndrome Level")
for (i in seq_along(covariates)) {
cat("\n \n \n")
covariate = covariates[i]
label = labels[i]
adj_formula = covariate
group = covariate
obj = pipeline(pseq, sample_id, adj_formula, group)
file_path = paste0("../outputs/ancombc_genus_", covariate, ".csv")
res = obj$out
res[, -1] = signif(res[, -1], 3)
write_csv(res, file_path)
heat_path1 = paste0("../images/abundance/genus_heatmap_", covariate, ".jpeg")
heat_path2 = paste0("../images/abundance/genus_heatmap_", covariate, ".pdf")
p1 = obj$p_heat +
labs(title = label) +
theme(plot.title = element_text(hjust = 0.5))
print(p1)
cat("\n \n \n")
ggsave(filename = heat_path1, plot = p1, height = 5,
width = 6.25, units = 'in', dpi = 300)
ggsave(filename = heat_path2, plot = p1, height = 5, width = 6.25)
rel_path1 = paste0("../images/abundance/genus_relative_", covariate, ".jpeg")
rel_path2 = paste0("../images/abundance/genus_relative_", covariate, ".pdf")
p2 = obj$p_rel +
labs(title = label) +
theme(plot.title = element_text(hjust = 0.5))
print(p2)
cat("\n \n \n")
ggsave(filename = rel_path1, plot = p2, height = 5,
width = 6.25, units = 'in', dpi = 300)
ggsave(filename = rel_path2, plot = p2, height = 5, width = 6.25)
}











